#linear regression models - revised  data

# Using pc assigned to nearest station (time) - unweighted population - 1 min work pop
result1 <- lm(formula = log(entex1112) ~ log1p(n1t_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result1)
AIC(result1)

# Using pc assigned to nearest station (time) - unweighted population - 2 min work pop
result2 <- lm(formula = log(entex1112) ~ log1p(n1t_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result2)
AIC(result2)

# Using pc assigned to nearest station (time) - unweighted population - 3 min work pop
result3 <- lm(formula = log(entex1112) ~ log1p(n1t_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_3m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result3)
AIC(result3)

# Using pc assigned to nearest station (time) - unweighted population - 4 min work pop
result4 <- lm(formula = log(entex1112) ~ log1p(n1t_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_4m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result4)
AIC(result4)


# Using pc assigned to nearest station (distance) - unweighted population - 1 min work pop
result5 <- lm(formula = log(entex1112) ~ log1p(n1d_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result5)
AIC(result5)

# Using pc assigned to nearest station (distance) - unweighted population - 2 min work pop
result6 <- lm(formula = log(entex1112) ~ log1p(n1d_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result6)
AIC(result6)

# Using pc assigned to nearest station (distance) - unweighted population - 3 min work pop
result7 <- lm(formula = log(entex1112) ~ log1p(n1d_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_3m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result7)
AIC(result7)

# Using pc assigned to nearest station (distance) - unweighted population - 4 min work pop
result8 <- lm(formula = log(entex1112) ~ log1p(n1d_totalp) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_4m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result8)
AIC(result8)




# Using pc assigned to nearest station (time) - weighted population - 1 min work pop

result9 <- lm(formula = log(entex1112) ~ log1p(wpop2432) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result9)
AIC(result9)

result10 <- lm(formula = log(entex1112) ~ log1p(wpop2432) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result10)
AIC(result10)



# Using pc assigned to nearest station (distance) - distance weighted population - 1 min work pop

result11 <- lm(formula = log(entex1112) ~ log1p(wpop15212) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result11)
AIC(result11)


result12 <- lm(formula = log(entex1112) ~ log1p(wpop15212) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result12)
AIC(result12)

# Using probability and distance-weighted population - combined model (no added mains) - 1 min work pop


result13 <- lm(formula = log(entex1112) ~ log(te19cmb_15212) + log(dailyfrequency_2013_all) + log(nr_catad_km) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result13)
AIC(result13)

# remove nr_catad_km
result14 <- lm(formula = log(entex1112) ~ log(te19cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result14)
AIC(result14)

## February 2019 using revised distance decay function
# remove nr_catad_km
result14a <- lm(formula = log(entex1112) ~ log(te19cmb_15212_adj) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result14a)
AIC(result14a)




CVlm(data=catef_te_models_ews, form.lm=formula(log(entex1112) ~ log(te19cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy), m= 10, plotit = c("Residual"), main="Small symbols show cross-validation predicted values", legend.pos="topleft", printit = TRUE, dots = FALSE)


# remove nr_catad_km
result15 <- lm(formula = log(entex1112) ~ log(te19cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result15)
AIC(result15)


# Using probability and time-weighted population - combined model (no added mains) - 1 min work pop
# removed nr_catad_km_ln

result16 <- lm(formula = log(entex1112) ~ log(te19cmb_2432) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy, data = catef_te_models_ews)

summary(result16)
AIC(result16)


result17 <- lm(formula = log(entex1112) ~ log(te19cmb_2432) + log(dailyfrequency_2013_all) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy, data = catef_te_models_ews)

summary(result17)
AIC(result17)

# Using te24 - include accessibility term, and distance weighting

result18 <- lm(formula = log(entex1112) ~ log(te24cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy, data = catef_te_models_ews)

summary(result18)
AIC(result18)


# Using te24 - include accessibility term, and distance weighting, 2 minute workplace

result18a <- lm(formula = log(entex1112) ~ log(te24cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_2m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy, data = catef_te_models_ews)

summary(result18a)
AIC(result18a)


# Using probability and distance-weighted population - combined model (WITH added mains stations) - 1 min work pop


result19 <- lm(formula = log(entex1112) ~ log(te12cmbmn_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + tcard_bound_dummy + TerminusDummy, data = catef_te_models_ews)

summary(result19)
AIC(result19)


# pure trip rate

result20 <- lm(formula = log(entex1112) ~ log1p(wpop15212) + log1p(work_pop_3m), data = catef_te_models_ews)
summary(result20)
AIC(result20)

result21 <- lm(formula = log(entex1112) ~ log1p(wpop2432) + log1p(work_pop_3m), data = catef_te_models_ews)
summary(result21)
AIC(result21)

result22 <- lm(formula = log(entex1112) ~ log(te19cmb_15212) + log1p(work_pop_3m), data = catef_te_models_ews)
summary(result22)
AIC(result22)

result23 <- lm(formula = log(entex1112) ~ log(te19cmb_2432) + log1p(work_pop_3m), data = catef_te_models_ews)
summary(result23)
AIC(result23)

result24 <- lm(formula = log(entex1112) ~ log(te12cmbmn_15212) + log1p(work_pop_3m), data = catef_te_models_ews)
summary(result24)
AIC(result24)



# standardized residuals - model 8
model8.lm <- result14
model8.stdres = rstandard(model8.lm)

#We now plot the standardized residual against the observed values of the variable waiting.
plot(log(catef_te_models_ews$entex1112), model8.stdres, 
     ylab="Standardised residuals", 
     xlab="ln(entries/exits)", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15
) 
abline(0, 0)                  # the horizon 


# standardized residuals - model 9
model9.lm <- result18
model9.stdres = rstandard(model9.lm)

# create dataframe to plot residuals in GIS

model9_residuals <- data.frame(crscode = catef_te_models_ews$crscode, easting = catef_te_models_ews$easting, northing = catef_te_models_ews$northing, residual = model9.lm$residuals, stdres = model9.stdres)

write.csv(file = "demand_models/blainey_models/postcodes/revised_models/model9_residuals.csv", model9_residuals, row.names = FALSE)

#We now plot the standardized residual against the observed values of the variable waiting.
plot(log(catef_te_models_ews$entex1112), model9.stdres, 
     ylab="Standardised residuals", 
     xlab="ln(entries/exits)", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15
)
abline(0, 0)                  # the horizon 

model9.lm$model$`log(te24cmb_15212)`


#We now plot the standardized residual against the probability and decay weighted catchment population
plot(exp(model9.lm$model$`log(te24cmb_15212)`), model9.stdres, 
     ylab="Standardised residuals", 
     xlab="Weighted catchment population", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15,
     xaxp  = c(0, 16500, 33),
     cex.axis = 0.6
)
#abline(0, 0)                  # the horizon 
abline(0, 0)                  # the horizon 
abline(-1.75, 0)                  # the horizon 
abline(v = 250)                  # the horizon 
# Indicates that over and under prediction becomes greater at small weighted catchment population

low_pop <- data.frame(ln_te24cmb15212 = model9.lm$model$`log(te24cmb_15212)`, stdres = model9.stdres)
low_pop <- filter(low_pop, exp(ln_te24cmb15212) <= 5000)

# look at under 5000 only
plot(exp(low_pop$ln_te24cmb15212), low_pop$stdres, 
     ylab="Standardised residuals", 
     xlab="Weighted catchment population (<= 5000)", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15,
     xaxp  = c(0, 5000, 50),
     cex.axis = 0.6
)
#abline(0, 0)                  # the horizon 
abline(2, 0)                  # the horizon 
abline(-2, 0)                  # the horizon 
abline(v = 150)                  # the horizon 
# Indicates that over and under prediction becomes greater at small weighted catchment population



# standardized residuals - model 10
model10.lm <- result19
model10.stdres = rstandard(model10.lm)

#We now plot the standardized residual against the observed values of the variable waiting.
plot(log(catef_te_models_ews$entex1112), model10.stdres, 
     ylab="Standardised residuals", 
     xlab="ln(entries/exits)", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15
) 
abline(0, 0)                  # the horizon 


# standardized residuals - model 7
model7.lm <- result11
model7.stdres = rstandard(model7.lm)

#We now plot the standardized residual against the observed values of the variable waiting.
plot(log(catef_te_models_ews$entex1112), model7.stdres, 
     ylab="Standardised residuals", 
     xlab="ln(entries/exits)", 
     main=NULL,
     col = '#fc8d6288',
     pch = 15
) 
abline(0, 0)                  # the horizon 




sink(file = "demand_models/blainey_models/postcodes/revised_models/model-output.txt", append = FALSE, type = c("output", "message"),
     split = TRUE)



## Cross validation

#test using cv.glm
result18.glm <- glm(formula = log(entex1112) ~ log(te24cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy, data = catef_te_models_ews)
summary(result18.glm)

cv.err=cv.glm(catef_te_models_ews,
              result18.glm, K=10)$delta[1]

cv.err=cv.glm(catef_te_models_ews,
              result18.glm, K=10)$delta[2]

library("DAAG")
x <- .Random.seed
sink(file = "demand_models/blainey_models/postcodes/revised_models/cross-validation/cvmodel9.txt", append = TRUE, type = c("output", "message"), split = TRUE)

# model 9

# Evaluate models uses k-fold cross-validation

cvsummary <- data.frame( cvrep = character(),
                                 fold1 = numeric(),
                                 fold2 = numeric(),
                         fold3 = numeric(),
                         fold4 = numeric(),
                         fold5 = numeric(),
                         fold6 = numeric(),
                         fold7 = numeric(),
                         fold8 = numeric(),
                         fold9 = numeric(),
                         fold10 = numeric(),
                                 average = numeric(), stringsAsFactors = F
)

# 10 repeats of 10 folds

for (i in 1:10) {
 
# set repeat name 
rep <- paste0("rep", i)

# run the model, 10 folds
model9cv <- CVlm_my(data=catef_te_models_ews, form.lm=formula(log(entex1112) ~ log(te24cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy), m= 10, plotit = F, main= NA, legend.pos="topleft", printit = TRUE, dots = F, seed = x[i])

# calculate residuals
model9cv[,"residual"] <- model9cv$`log(entex1112)` - model9cv$cvpred
model9cv[,"sqrresidual"] <- model9cv[,"residual"]^2

# summarise mse and rmse
summary_mse <- model9cv %>%
               group_by(fold) %>%
               summarise(mse = mean(sqrresidual), rmse = sqrt(mse))

# add to summary table
newRow <- data.frame(cvrep = rep, t(summary_mse %>% select(mse)), mean = mean(summary_mse$mse), stringsAsFactors = F, row.names = NULL)
cvsummary[nrow(cvsummary)+1,] <- newRow

}

cvsummary <- cvsummary %>% mutate_if(is.numeric, round, 3)

save(cvsummary, file="demand_models/blainey_models/postcodes/revised_models/cross-validation/cvsummary.Rda")
write.csv(cvsummary, file="demand_models/blainey_models/postcodes/revised_models/cross-validation/cvsummary.csv")

#to plot
model9cv <- CVlm_my(data=catef_te_models_ews, form.lm=formula(ln(entex1112) ~ log(te24cmb_15212) + log(dailyfrequency_2013_all) + log1p(work_pop_1m) + log1p(carspaces) + electric_dummy + TerminusDummy + tcard_bound_dummy), m=10, plotit = T, main= NA, legend.pos="topleft", printit = TRUE, dots = T, seed = x[1])

## sum of squares
sum(model9cv$residual^2)

## mean of sum of squares
(sum(model9cv$residual^2))/nrow(model9cv)

## mean error
(sum(model9cv$residual))/nrow(model9cv)

# mean percentage error
(sum(model9cv$residual/model9cv$`log(entex1112)`)*100)/nrow(model9cv)

# mean absolute percentage error
(sum(abs(model9cv$residual/model9cv$`log(entex1112)`))*100)/nrow(model9cv)

## mean absolute error
(sum(abs(model9cv$residual)))/nrow(model9cv)

##Root mean squared error (RMSE)
sqrt((sum(model9cv$residual^2))/nrow(model9cv))

#sum of (lnQ)^2

sum(log(model9cv$`log(entex1112)`/model9cv$cvpred)^2)



